{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import gffutils\n",
    "import gzip\n",
    "from Bio import Alphabet, Seq, SeqIO"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Retrieving data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "--2020-12-04 12:07:47--  ftp://ftp.vectorbase.org/public_data/organism_data/agambiae/Genome/agambiae.CHROMOSOMES-PEST.AgamP3.fa.gz\n",
      "           => ‘gambiae.fa.gz’\n",
      "Resolving ftp.vectorbase.org (ftp.vectorbase.org)... 129.74.255.228\n",
      "Connecting to ftp.vectorbase.org (ftp.vectorbase.org)|129.74.255.228|:21... connected.\n",
      "Logging in as anonymous ... Logged in!\n",
      "==> SYST ... done.    ==> PWD ... done.\n",
      "==> TYPE I ... done.  ==> CWD (1) /public_data/organism_data/agambiae/Genome ... done.\n",
      "==> SIZE agambiae.CHROMOSOMES-PEST.AgamP3.fa.gz ... 81591806\n",
      "==> PASV ... done.    ==> RETR agambiae.CHROMOSOMES-PEST.AgamP3.fa.gz ... done.\n",
      "Length: 81591806 (78M) (unauthoritative)\n",
      "\n",
      "agambiae.CHROMOSOME 100%[===================>]  77.81M  2.42MB/s    in 32s     \n",
      "\n",
      "2020-12-04 12:08:20 (2.41 MB/s) - ‘gambiae.fa.gz’ saved [81591806]\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Start from a clean slate (stale annotation DB and genome archive removed),\n",
    "# then download the AgamP3 chromosome FASTA and save it as gambiae.fa.gz.\n",
    "!rm -rf ag.db gambiae.fa.gz 2>/dev/null\n",
    "!wget -O gambiae.fa.gz ftp://ftp.vectorbase.org/public_data/organism_data/agambiae/Genome/agambiae.CHROMOSOMES-PEST.AgamP3.fa.gz"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build the gffutils SQLite database (ag.db) straight from the remote GFF3 file.\n",
    "db = gffutils.create_db(\n",
    "    'https://vectorbase.org/common/downloads/Pre-VEuPathDB%20VectorBase%20files/Anopheles-gambiae-PEST_BASEFEATURES_AgamP4.2.gff3.gz',\n",
    "    'ag.db',\n",
    "    force=True,  # overwrite an existing ag.db instead of shelling out to rm\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Getting a gene"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Identifier of the gene whose coding sequence is extracted below.\n",
    "gene_id = 'AGAP004707'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Look the gene up by ID in the annotation database.\n",
    "gene = db[gene_id]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2L\tVectorBase\tgene\t2358158\t2431617\t.\t+\t.\tID=AGAP004707;biotype=protein_coding\n",
      "2L +\n"
     ]
    }
   ],
   "source": [
    "# Show the full annotation record, then only its chromosome and strand.\n",
    "print(gene)\n",
    "print(f'{gene.seqid} {gene.strand}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "chromosome:AgamP3:2L:1:49364325:1 chromosome 2L\n",
      "SingleLetterAlphabet()\n"
     ]
    }
   ],
   "source": [
    "# Scan the genome FASTA for the chromosome that carries the gene.\n",
    "# The with-block closes the gzip handle even though reading stops early.\n",
    "with gzip.open('gambiae.fa.gz', 'rt', encoding='utf-8') as fasta_handle:\n",
    "    recs = SeqIO.parse(fasta_handle, 'fasta')\n",
    "    for rec in recs:\n",
    "        print(rec.description)\n",
    "        # e.g. 'chromosome:AgamP3:2L:1:49364325:1 chromosome 2L' -> field 2 is the name\n",
    "        fields = rec.description.split(':')\n",
    "        if len(fields) > 2 and fields[2] == gene.seqid:\n",
    "            my_seq = rec.seq\n",
    "            break\n",
    "    else:\n",
    "        # Fail loudly rather than leaving my_seq undefined (or stale from an earlier run).\n",
    "        raise ValueError(f'Chromosome {gene.seqid!r} not found in gambiae.fa.gz')\n",
    "print(my_seq.alphabet)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_sequence(chrom_seq, CDSs, strand):\n",
    "    \"\"\"Splice CDS intervals out of a chromosome and return the coding sequence.\n",
    "\n",
    "    Parameters:\n",
    "        chrom_seq: sequence of the whole chromosome, sliced with 0-based,\n",
    "            end-exclusive indices.\n",
    "        CDSs: iterable of features with 1-based, inclusive `start` and `end`;\n",
    "            pieces are concatenated in the order given, so pass them sorted\n",
    "            by start.\n",
    "        strand: '+' returns the concatenation unchanged; any other value\n",
    "            returns its reverse complement.\n",
    "\n",
    "    Returns:\n",
    "        A Seq built with the IUPAC unambiguous DNA alphabet.\n",
    "    \"\"\"\n",
    "    # NOTE(review): the phase of each CDS is not consulted; this assumes the\n",
    "    # first CDS starts in frame -- confirm for partial transcripts.\n",
    "    # Join plain strings once instead of growing a Seq piece by piece.\n",
    "    spliced = ''.join(str(chrom_seq[CDS.start - 1: CDS.end]) for CDS in CDSs)\n",
    "    seq = Seq.Seq(spliced, alphabet=Alphabet.IUPAC.unambiguous_dna)\n",
    "    return seq if strand == '+' else seq.reverse_complement()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "AGAP004707-RA\n",
      "6357 ATGACCGAAGACTCCGATTCGATATCTGAGGAAGAACGTAGTTTGTTCCGTCCTTTCACTCGTGAATCATTACAAGCTATCGAAGCACGCATTGCAGATGAAGAAGCCAAACAGCGAGAATTGGAAAGAAAACGAGCTGAGGGGGAGGATGAGGATGAAGGTCCCCAACCGGACCCTACTCTTGAACAGGGTGTACCAGTCCCAGTTCGAATGCAAGGCAGCTTCCCCCCGGAGTTGGCCTCCACGCCTCTCGAGGATATTGACAGTTTCTATTCAAATCAAAGGACATTCGTAGTGATTAGTAAAGGAAAAGATATATTTCGTTTCTCCGCAACTAACGCATTATATGTACTTGATCCGTTTAACCCCATACGCCGCGTAGCTATTTATATTTTAGTACATCCACTGTTTTCACTTTTTATAATAACGACCATTCTTGTTAATTGTATATTGATGATTATGCCTACCACGCCGACAGTCGAATCTACCGAGGTGATATTCACCGGCATCTACACGTTCGAATCAGCTGTAAAAGTGATGGCGCGAGGTTTCATATTACAACCGTTTACTTATCTTAGAGATGCATGGAATTGGTTGGACTTCGTAGTAATAGCATTAGCATATGTAACTATGGGTATAGATTTGGGTAATCTTGCTGCGTTGAGAACATTCAGGGTATTACGAGCTCTTAAAACGGTAGCCATCGTTCCAGGCTTAAAAACCATCGTCGGAGCCGTTATAGAATCCGTAAAGAATCTCAGAGATGTGATAATTTTAACAATGTTTTCGTTATCCGTGTTTGCTTTGATGGGTCTACAAATCTACATGGGAGTACTAACACAAAAGTGCATAAAAGAGTTCCCATTGGATGGTTCCTGGGGTAATCTAACCGACGAAAGCTGGGAGCTGTTCAACAGCAATGACACAAATTGGTTCTATTCCGAGAGTGGCGACATTCCTCTTTGTGGAAACTCATCTGGAGCTGGACAATGTGATGAAGGCTACATTTGTTTACAAGGCTATGGCAAAAATCCAAATTACGGGTATACAAGTTTTGATACATTCGGATGGGCATTCTTGTCTGCCTTTCGTCTAATGACTCAAGATTATTGGGAGAATTTATATCAACTGGTGTTACGATCAGCTGGACCGTGGCACATGCTCTTCTTCATTGTGATTATCTTCTTGGGTTCGTTTTACCTTGTAAATTTAATCTTGGCCATTGTCGCCATGTCGTACGACGAACTCCAGAAGAAGGCCGAAGAGGAAGAGGCCGCCGAGGAAGAAGCACTTCGGGAAGCAGAAGAAGCTGCGGCAGCAAAGGCAGCTAAACTGGAAGCACAACAAGCGGCCGCAGCAGCAGCGGCGAACCCAGAAATCGCTAAAAGCCCTTCGGATTTCTCCTGTCACAGCTATGAGTTGTTTGTCGGACAGGAGAAAGGCAACGACGATAACAATAAGGAGAAAATGTCCATTAGAAGCGAAGGATTGGAGTCGGTGAGCGAAATCACAAGAACAACCGCACCAACAGCTACTGCAGCTGGCACTGCAAAAGCCCGTAAAGTGAGCGCGGCTTCACTTTCATTACCCGGTTCACCATTTAATCTTCGTAGAGGATCTAGAGGATCACACCAGTTCACGATACGTAACGGTAGAGGACGTTTCGTTGGTGTACCTGGTAGCGATAGAAAACCACTGGTACTATCAACATATCTCGATGCACAAGAACACCTGCCATACGCTGATGATTCCAACGCGGTAACGCCGATGTCGGAAGAAAATGGTGCAATCATCGTTCCAGTATACTATGCTAATTTAGGTTCAAGGCACTCGTCGTATACTTCGCATCAGTCGCGTATTTCGTACACATCTCACGGTGACCTGCTCGGGGGCATGACAAAAGAGAGCCGTCTGCGAAATCGATCAGCCCGTAACACTAACCATTCAATTGTACCACCTCCGAATGCTAACAATCTATCCTACGCTGA
AACAAACCATAAAGGACAGCGAGATTTCGACTTGACACAGGACTGTACAGACGATGCCGGAAAAATAAAACATAACGACAATCCTTTCATAGAACCTGCTCAAACTCAAACTGTGGTAGATATGAAAGACGTGATGGTGTTAAATGACATTATTGAGCAAGCTGCTGGTCGGCACAGCAGGGCAAGTGATCACGGAGTCTCTGTTTATTACTTCCCCACAGAGGACGACGACGAAGATGGCCCAACGTTTAAGGACAAAGCCCTCGAGTTTCTGATGAAGATGATCGACATTTTCTGTGTGTGGGACTGTTGTTGGGTTTGGCTTAAATTTCAGGAATGGGTTGCTTTTATTGTGTTTGACCCATTTGTAGAGCTATTCATTACGCTCTGCATTGTGGTAAATACACTGTTTATGGCTTTAGATCATCACGATATGGATCCAGATATGGAAAAGGCACTGAAAAGTGGCAACTACTTCTTCACAGCTACATTTGCCATCGAAGCTACAATGAAGCTCATAGCAATGAGCCCCAAATATTACTTTCAAGAGGGTTGGAATATCTTCGATTTTATTATCGTAGCACTGTCTCTGCTAGAATTGGGACTTGAGGGTGTTCAAGGATTGTCAGTATTACGATCGTTCCGTTTGCTAAGAGTTTTCAAGCTGGCAAAATCATGGCCTACATTGAATCTTCTAATTTCCATCATGGGACGTACAGTTGGTGCCCTTGGTAATTTAACCTTCGTCTTATGCATCATCATTTTCATCTTCGCCGTGATGGGGATGCAACTTTTTGGCAAAAACTACACAGATAATGTGGATAGATTCCCCGACCATGATCTGCCAAGATGGAATTTTACAGATTTCATGCATTCCTTCATGATTGTGTTCCGTGTGCTATGCGGAGAATGGATTGAATCAATGTGGGATTGTATGCTTGTCGGTGATGTATCCTGCATACCATTTTTCTTGGCCACTGTAGTGATAGGAAATTTAGTCGTGCTTAACCTTTTCTTAGCCTTGCTTTTGTCAAATTTTGGTTCATCATCCTTGTCTGCACCAACGGCAGATAATGAGACCAACAAGATTGCAGAAGCGTTCAACAGAATATCACGCTTTTCTAACTGGATTAAAATGAATTTAGCAAACGCTCTCAAGTTTGTAAAAAATAAATTAACAAGCCAAATAGCATCCGTTCAACCGACAGGCAAAGGCGTATGTCCATGTATATCTTCAGAGCATGGTGAAAATGAACTGGAACTTACTCCAGACGATATTTTGGCGGATGGACTATTGAAGAAAGGAATCAAAGAGCACAACCAACTGGAAGTAGCGATTGGCGATGGCATGGAATTCACCATTCATGGTGATCTGAAGAACAAAGCAAAAAAGAATAAACAAATCATGAACAACTCTAAGGTGATAGGCAATTCTATTAGTAATCATCAAGATAATAAATTAGATCATGAACTGAATCATAGAGGCGTGTCCTTACAGGACGATGATACTGCTAGTATCAAATCTTATGGCAGTCACAAGAATCGCCCATTTAAGGATGAAAGCCACAAAGGCAGCGCCGAAACGATGGAGGGTGAAGAAAAACGTGATGCCAGCAAGGAGGATCTAGGAATTGACGAAGAACTCGACGACGAAGGCGAGGGAGATGAAGGTCCTCTGGATGGAGAGCTGATTATTCATGCAGAAGAAGACGAAGTGATTGAGGATTCACCGGCGGATTGCTGCCCGGACAACTGCTACAAAAAATTTCCTGTTCTTGCTGGGGATGATGACGCGCCGTTCTGGCAGGGTTGGGGAAATTTACGTCTCAAAACGTTTCAGCTAATAGAAAATAAGTATTTTGAGACAGCTGTAATTACAATGATTCTGCTTAGTAGCTTAGCTTTGGCCCTCGAAGATGTGCATCTTCCACAGCGCCCAATCCTTCAAGATATTCTTTATTACATGGATCGAATTTTCACAGTGATCttttttttAGAGA
TGTTAATCAAATGGTTAGCTTTAGGTTTTAAAGTATATTTTACAAATGCTTGGTGTTGGCTTGATTTCATTATCGTGATGGTATCTTTGATAAACTTCGTTGCTTCACTTTGTGGAGCTGGTGGTATTCAAGCATTCAAAACCATGCGAACTCTTAGAGCCCTGAGACCACTACGTGCCATGTCCCGTATGCAGGGAATGAGGGTCGTCGTGAATGCGTTGGTTCAAGCTATACCGTCCATCTTCAACGTGCTGCTGGTTTGTTTGATATTCTGGCTAATATTTGCAATCATGGGGGTGCAATTATTTGCTGGCAAATACTTCAAGTGTGTGGATAAAAATAAAACTACATTACCTCACGAAATTATACCGGATGTAAATGCTTGCAAAGCCGAAAACTATTCATGGGAAAATTCACCAATGAACTTCGATCATGTAGGTAAAGCATATTTGTGTCTGTTTCAAGTAGCCACATTCAAAGGATGGATACAAATAATGAACGATGCTATTGATTCTAGAGACGTAAGTTTTGTCGGTAAACAGCCTATACGGGAAACGAATATCTACATGTATCTGTACTTTGTGTTCTTTATTATCTTTGGGTCATTCTTCACGTTGAATCTATTCATTGGTGTTATAATTGACAACTTCAATGAACAGAAAAAGAAAGCTGGTGGATCGCTAGAAATGTTCATGACAGAGGATCAGAAAAAGTACTATAATGCAATGAAAAAAATGGGTTCGAAGAAACCTCTAAAGGCAATTCCTCGTCCAAGGTGGCGGCCTCAAGCAATAGTTTTTGAAATAGTGACGAACAAAAAGTTTGACATGATTATCATGTTGTTCATCGGATTCAATATGTTAACTATGACACTGGACCACTACAAACAATCAGAAACTTTTAGTGCTGTTTTGGATTACTTGAATATGATATTCATCTGCATATTCAGCAGCGAATGTTTAATGAAGATTTTTGCACTTCGTTATCATTACTTTATCGAGCCATGGAATTTGTTTGATTTTGTTGTCGTCATTCTTTCGATTTTGGGCCTTGTTCTAAGTGATATCATTGAAAAATATTTTGTATCTCCCACACTTCTACGAGTCGTGCGAGTGGCAAAAGTGGGCCGAGTATTGCGTTTGGTTAAAGGAGCCAAGGGTATCCGAACGTTGCTGTTTGCATTAGCAATGTCGCTACCTGCACTATTTAACATCTGCTTGTTACTCTTTTTGGTGATGTTTATATTTGCCATTTTTGGAATGTCATTTTTCATGCACGTCAAAGATAAGAGTGGCTTAGATGACGTGTACAATTTTAAAACGTTTGGCCAGAGCATGATTTTACTATTTCAGATGTCAACCTCGGCTGGGTGGGATGGTGTTTTAGATGGTATTATCAATGAAGAAGACTGTCTTCCACCAGACAATGATAAGGGCTATCCGGGAAATTGTGGTTCATCAACAATTGGCATAACGTACTTATTGGCGTATCTTGTAATAAGTTTCCTTATCGTTATTAACATGTACATTGCTGTTATCCTCGAAAACTACTCGCAAGCTACGGAAGATGTTCAAGAAGGCTTAACTGATGACGATTATGATATGTACTACGAAATATGGCAGCAATTCGATCCTGACGGTACACAATACGTTCGATATGATCAGCTATCAGACTTTTTGGATGTGCTGGAACCGCCTCTACAGATTCATAAACCAAATCGTTATAAGATTATTTCGATGGATATTCCGATATGCCGCGGAGATATGATGTTCTGTGTCGATATTCTAGATGCACTAACGAAAGATTTTTTTGCTAGAAAAGGAAATCCTATAGAAGAAACAGCCGAATTAGGTGAAGTTCAACAACGCCCAGACGAAGTTGGTTACGAACCAGTATCATCAACACTTTGGAGGCAGCGTGAAGAGTACTGTGCTCGACTGATACAGCATGCGTGGAAACGCTATAAACAGCGTCACGGAGGCGGAACAGACGCTTCAGGA
GATGATCTTGAAATAGATGCCTGTGATAACGGTTGTGGTGGTGGTAATGGCAATGAAAATGATGATAGTGGAGATGGTGCAACAGGTAGTGGTGACAACGGAAGTCAGCATGGTGGTGGCAGCATAAGTGGCGGAGGAGGAACTCCTGGTGGTGGTAAAAGTAAAGGAATTATTGGCAGTACTCAGGCTAACATAGGCATAGTGGATAGTAATATATCACCAAAGGAATCACCGGATAGCATCGGCGATCCCCAAGGTCGTCAGACGGCCGTTCTTGTGGAGAGCGACGGATTTGTGACGAAAAACGGTCACCGTGTCGTCATACACTCTCGATCTCCCAGCATAACATCGCGAACGGCAGATGTCTGA\n",
      "2119 MTEDSDSISEEERSLFRPFTRESLQAIEARIADEEAKQRELERKRAEGEDEDEGPQPDPTLEQGVPVPVRMQGSFPPELASTPLEDIDSFYSNQRTFVVISKGKDIFRFSATNALYVLDPFNPIRRVAIYILVHPLFSLFIITTILVNCILMIMPTTPTVESTEVIFTGIYTFESAVKVMARGFILQPFTYLRDAWNWLDFVVIALAYVTMGIDLGNLAALRTFRVLRALKTVAIVPGLKTIVGAVIESVKNLRDVIILTMFSLSVFALMGLQIYMGVLTQKCIKEFPLDGSWGNLTDESWELFNSNDTNWFYSESGDIPLCGNSSGAGQCDEGYICLQGYGKNPNYGYTSFDTFGWAFLSAFRLMTQDYWENLYQLVLRSAGPWHMLFFIVIIFLGSFYLVNLILAIVAMSYDELQKKAEEEEAAEEEALREAEEAAAAKAAKLEAQQAAAAAAANPEIAKSPSDFSCHSYELFVGQEKGNDDNNKEKMSIRSEGLESVSEITRTTAPTATAAGTAKARKVSAASLSLPGSPFNLRRGSRGSHQFTIRNGRGRFVGVPGSDRKPLVLSTYLDAQEHLPYADDSNAVTPMSEENGAIIVPVYYANLGSRHSSYTSHQSRISYTSHGDLLGGMTKESRLRNRSARNTNHSIVPPPNANNLSYAETNHKGQRDFDLTQDCTDDAGKIKHNDNPFIEPAQTQTVVDMKDVMVLNDIIEQAAGRHSRASDHGVSVYYFPTEDDDEDGPTFKDKALEFLMKMIDIFCVWDCCWVWLKFQEWVAFIVFDPFVELFITLCIVVNTLFMALDHHDMDPDMEKALKSGNYFFTATFAIEATMKLIAMSPKYYFQEGWNIFDFIIVALSLLELGLEGVQGLSVLRSFRLLRVFKLAKSWPTLNLLISIMGRTVGALGNLTFVLCIIIFIFAVMGMQLFGKNYTDNVDRFPDHDLPRWNFTDFMHSFMIVFRVLCGEWIESMWDCMLVGDVSCIPFFLATVVIGNLVVLNLFLALLLSNFGSSSLSAPTADNETNKIAEAFNRISRFSNWIKMNLANALKFVKNKLTSQIASVQPTGKGVCPCISSEHGENELELTPDDILADGLLKKGIKEHNQLEVAIGDGMEFTIHGDLKNKAKKNKQIMNNSKVIGNSISNHQDNKLDHELNHRGVSLQDDDTASIKSYGSHKNRPFKDESHKGSAETMEGEEKRDASKEDLGIDEELDDEGEGDEGPLDGELIIHAEEDEVIEDSPADCCPDNCYKKFPVLAGDDDAPFWQGWGNLRLKTFQLIENKYFETAVITMILLSSLALALEDVHLPQRPILQDILYYMDRIFTVIFFLEMLIKWLALGFKVYFTNAWCWLDFIIVMVSLINFVASLCGAGGIQAFKTMRTLRALRPLRAMSRMQGMRVVVNALVQAIPSIFNVLLVCLIFWLIFAIMGVQLFAGKYFKCVDKNKTTLPHEIIPDVNACKAENYSWENSPMNFDHVGKAYLCLFQVATFKGWIQIMNDAIDSRDVSFVGKQPIRETNIYMYLYFVFFIIFGSFFTLNLFIGVIIDNFNEQKKKAGGSLEMFMTEDQKKYYNAMKKMGSKKPLKAIPRPRWRPQAIVFEIVTNKKFDMIIMLFIGFNMLTMTLDHYKQSETFSAVLDYLNMIFICIFSSECLMKIFALRYHYFIEPWNLFDFVVVILSILGLVLSDIIEKYFVSPTLLRVVRVAKVGRVLRLVKGAKGIRTLLFALAMSLPALFNICLLLFLVMFIFAIFGMSFFMHVKDKSGLDDVYNFKTFGQSMILLFQMSTSAGWDGVLDGIINEEDCLPPDNDKGYPGNCGSSTIGITYLLAYLVISFLIVINMYIAVILENYSQATEDVQEGLTDDDYDMYYEIWQQFDPDGTQYVRYDQLSDFLDVLEPPLQIHKPNRYKIISMDIPICRGDMMFCVDILDALTKDFFARKGNPIEETAELGEVQQRPDEVGYEPVSSTLWRQREEYCARLIQHAWKRYKQRH
GGGTDASGDDLEIDACDNGCGGGNGNENDDSGDGATGSGDNGSQHGGGSISGGGGTPGGGKSKGIIGSTQANIGIVDSNISPKESPDSIGDPQGRQTAVLVESDGFVTKNGHRVVIHSRSPSITSRTADV*\n"
     ]
    }
   ],
   "source": [
    "# Walk the gene's transcripts until one whose ID ends in 'RA' turns up.\n",
    "mRNAs = db.children(gene, featuretype='mRNA')\n",
    "for mRNA in mRNAs:\n",
    "    print(mRNA.id)\n",
    "    if mRNA.id.endswith('RA'):\n",
    "        break\n",
    "else:\n",
    "    # Without this, a missing RA transcript would silently fall through to the last\n",
    "    # transcript seen, or raise NameError when the gene has no mRNA at all.\n",
    "    raise ValueError(f'No transcript ending in RA found for gene {gene.id}')\n",
    "\n",
    "# get_sequence concatenates in the order given, so fetch the CDSs sorted by start.\n",
    "CDSs = db.children(mRNA, featuretype='CDS', order_by='start')\n",
    "gene_seq = get_sequence(my_seq, CDSs, gene.strand)\n",
    "\n",
    "print(len(gene_seq), gene_seq)\n",
    "prot = gene_seq.translate()\n",
    "print(len(prot), prot)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Reverse strand"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Transcript used for the reverse-strand example below.\n",
    "reverse_transcript_id = 'AGAP004708-RA'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1992 ATGGCTGACTTCGATAGTGCCACTAAATGTATCAGAAACATTGAAAAAGAAATTCTTCTCTTGCAATCCGAAGTTTTGAAGACTCGTGAGGGGCTTGGGCTGGAAGATGATAACGTGGAACTTAAAAAGTTAATGGAGGAAAACACGAGATTAAAGCATCGTTTGGAGATAGTGCAATCGGCTATTGTACAGGAAGGCGGATCAATCGCATCCTCCGATTCTGGCAACCAATCCATTGTTGGCGAACTGCAGCAAGTATTTACCGAAGCCATTCAAAAAGCTTTTCCAAGTGTGTTGGTTGAGGCGGTTATTACTATTTCGTCATCCCCCAAGTTTGGCGATTATCAATGCAATAGTGCTATGCAGATTGCGCAGCATTTGAAGCAGTTATCTGTTAAATCGTCGCCACGTGAAGTGGCCCAAAAACTGGTAGCTGAATTGCAAAAACCAATACCTTGTGTCGATAGATTAGAAATCGCTGGAGCGGGATACGTTAATATTTTCCTGTCTAGATCTTATGGAGAACAACGCATTATGAGCATCTTGAGGCATGGGATTGTGGTACCATTAATAGAAAAGAAACGTGTGATAGTCGATTTTTCCTCGCCTAACGTAGCGAAAGAAATGCATGTCGGTCATTTACGTTCGACCATCATTGGTGATTCAATTTGTCGATTTTTGGAATATCTCGGACACGATGTGCTTCGTATTAACCATATCGGAGACTGGGGAACGCAATTTGGTATGTTAATTGCTCATTTGCAGGACCGTTTCCCTAATTTCCAAACCGAGTCCCCGCCTATCAGCGATTTGCAAGCATTTTACAAGGAGTCAAAGGTCCGATTTGACAGCGATGAAGTATTTAAAAAGCGTGCCTACGAATGTGTAGTCAAACTGCAAAGTGGAGAGCTGAGTTATTTGAAGGCCTGGAATCTAATTTGCGATGTTTCACGCAAAGAATTCCAAACCATCTACAACAGATTGGATGTGAAACTAGTTGAACGTGGTGAATCGTTTTATCAAAGCAGAATGGAAAAAATCGTAGAAGAACTTAAGCAGGATGGGTTCCTTGAAGAAGACGAAGGCCGTCTTATCATGTGGGGCGAAAATCGCGCTGGAATTCCTTTAACCATCGTAAAATCAGACGGAGGATTTACATATGATACTTCGGATATGGCCGCCATCAAACAACGCTTGCAAGAAGAAAAGGCTGATTGGTTGATATATGTAACTGACGCTGGGCAGGCGACTCATTTCCAAACAATTTTTTCTTGTGCAAAACGAGCCAAAATCCTACAAGAGAGCAAACATCGTGTGGATCACGTCGGATTTGGTGTGGTGCTAGGCGAAGATGGTAAAAAATTCAAGACTCGTTCTGGCGATACGGTGAAATTGACAGAACTTCTCAATGAAGGTTTGAGGAGGGCTATGGAAAAACTAGTTCAGAAGGAAAGGAACTTAGTGCTCACACAAGAGGAGCTAGTTGCAGCACAAGAATCAGTCGCCTACGGTTGTATTAAATATGCGGATCTGTCGCATAATCGTAACAACGAATATGTGTTCTCCTTCGATAAGATGCTGGAGGACAAAGGAAATACTGCCGTGTATCTGTTGTATGCCTATACCCGCATTCGTTCTATTGCAAGAAAATGTGGCGGAGATTTTGCAAATGACATGCAAAAGGTGATCGATTCCACAGTTATTAAATTAGATCATGAAAAGGAATGGAAACTCGCCAAGGTGTTGCTTCGTTTTACCGACGTTATGTTATTGATCATGAAAAATCTATCGTTACATCATCTTTGTGAGTTTGTGTACGAAATATGCACTGCTTTTAGTGAGTTTTATGACAGTTGTTATTGCATCGAAAAAAATAAGCAAGGTGAAATTATTACTGTTTATCCCTCTCGCGTCTTGCTATGCGAAGCAACATCAAAGGTGCTGGAAAAATGTTTCGATATTTTAGGACTGAAGCCTGTGCATAAAAT
ATAA\n",
      "664 MADFDSATKCIRNIEKEILLLQSEVLKTREGLGLEDDNVELKKLMEENTRLKHRLEIVQSAIVQEGGSIASSDSGNQSIVGELQQVFTEAIQKAFPSVLVEAVITISSSPKFGDYQCNSAMQIAQHLKQLSVKSSPREVAQKLVAELQKPIPCVDRLEIAGAGYVNIFLSRSYGEQRIMSILRHGIVVPLIEKKRVIVDFSSPNVAKEMHVGHLRSTIIGDSICRFLEYLGHDVLRINHIGDWGTQFGMLIAHLQDRFPNFQTESPPISDLQAFYKESKVRFDSDEVFKKRAYECVVKLQSGELSYLKAWNLICDVSRKEFQTIYNRLDVKLVERGESFYQSRMEKIVEELKQDGFLEEDEGRLIMWGENRAGIPLTIVKSDGGFTYDTSDMAAIKQRLQEEKADWLIYVTDAGQATHFQTIFSCAKRAKILQESKHRVDHVGFGVVLGEDGKKFKTRSGDTVKLTELLNEGLRRAMEKLVQKERNLVLTQEELVAAQESVAYGCIKYADLSHNRNNEYVFSFDKMLEDKGNTAVYLLYAYTRIRSIARKCGGDFANDMQKVIDSTVIKLDHEKEWKLAKVLLRFTDVMLLIMKNLSLHHLCEFVYEICTAFSEFYDSCYCIEKNKQGEIITVYPSRVLLCEATSKVLEKCFDILGLKPVHKI*\n"
     ]
    }
   ],
   "source": [
    "# Read strand and chromosome from the transcript record instead of hard-coding them.\n",
    "reverse_transcript = db[reverse_transcript_id]\n",
    "if reverse_transcript.seqid != gene.seqid:\n",
    "    # my_seq holds the chromosome loaded for `gene`; slicing it with coordinates\n",
    "    # from a different chromosome would silently yield the wrong sequence.\n",
    "    raise ValueError(f'{reverse_transcript_id} is on {reverse_transcript.seqid}, but my_seq holds {gene.seqid}')\n",
    "\n",
    "reverse_CDSs = db.children(reverse_transcript, featuretype='CDS', order_by='start')\n",
    "reverse_seq = get_sequence(my_seq, reverse_CDSs, reverse_transcript.strand)\n",
    "\n",
    "print(len(reverse_seq), reverse_seq)\n",
    "reverse_prot = reverse_seq.translate()\n",
    "print(len(reverse_prot), reverse_prot)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
